/*******************************************************************************

Ryan Hill: ryan.hill@kellogg.northwestern.edu
Carolyn Stein: carolyn_stein@berkeley.edu
Last modified: November 2024

Inputs: 	pdb_full_structures.dta
			pdb_full_papers.dta
			pdb_full_entities.dta
			pdb_analysis.dta
			phat.dta
			
Outputs:	Tables 5, E8, E9

Purpose: 	Test whether competition is associated with lower quality
			Instrument for competition with potential and human indicator

*******************************************************************************/

* Build a structure-level taxonomy lookup from the entity-level file

	clear all
	use structureId taxonomy using "${data_built}pdb_full_entities.dta", clear
	
	* Keep the text before the first # separator as the organism name,
	* lower-cased so that spelling variants in case are pooled together
	split taxonomy, parse(#)
	drop taxonomy
	generate taxonomy = lower(taxonomy1)
	
	* Reduce to one row per structure carrying the most frequent name;
	* minmode picks the lowest value when several names tie for the mode
	bysort structureId: egen modeTaxonomy = mode(taxonomy), minmode
	collapse (first) modeTaxonomy, by(structureId)
	rename modeTaxonomy taxonomy
	
	* Hold the lookup in a temporary file for the merge further down
	tempfile taxonomy
	save `taxonomy'
	
* Assemble the structure-level data: attach taxonomy, restrict to the analysis
* sample, then add paper-level variables and the potential measure
	
	use "${data_built}pdb_full_structures.dta"
	
	* Taxonomy: every structure must be matched on both sides
	merge 1:1 structureId using `taxonomy', assert(match) nogenerate
		
	* Analysis sample: no using-only rows are allowed, and only matched
	* structures are kept
	merge 1:1 structureId using "${data_built}pdb_analysis.dta",			///
		assert(master match) keep(match) nogenerate

	* Paper-level data: discard papers that match no structure in memory
	merge m:1 pubmedId using "${data_built}pdb_full_papers.dta",			///
		keep(master match) nogenerate

	* Potential: must match one-to-one with the remaining structures
	merge 1:1 structureId using "${data_built}phat.dta", assert(match) nogenerate
	
	* Keep non-SG structures
	keep if structuralGenomics == 0
	
	* 0/1 indicators for the five species used in the first-stage table
	generate human       = taxonomy == "homo sapiens"
	generate ecoli       = taxonomy == "escherichia coli"
	generate mouse       = taxonomy == "mus musculus"
	generate yeast       = taxonomy == "saccharomyces cerevisiae"
	generate haybacillus = taxonomy == "bacillus subtilis"


/*------------------------------------------------------------------------------

	Table 5: Effect of Competition on Maturation and Quality

------------------------------------------------------------------------------*/

* Results matrix: one column per outcome, rows stacked as
*   1-3    OLS coefficient, SE, p-value
*   4-7    IV with the human instrument: coefficient, SE, p-value, effective F
*   8-11   IV with the potential instrument: coefficient, SE, p-value, effective F
*   12-13  outcome mean, N
matrix comp_results = J(13,2,.)
local j = 0

foreach Q in maturation indexStd {

		* Move to the column of this outcome and restart at the top row
		local ++j
		local i = 1

		* OLS regression with robust SEs; p-value from the t distribution
		regress `Q' priorityRace $time_controls $complexity_controls, robust
			local t = _b[priorityRace] / _se[priorityRace]
			matrix comp_results[`i',`j'] = _b[priorityRace]
			local ++i
			matrix comp_results[`i',`j'] = _se[priorityRace]
			local ++i
			matrix comp_results[`i',`j'] = 2*ttail(e(df_r), abs(`t'))
			local ++i

		* IV regressions, first with the human indicator and then with the
		* potential measure as the excluded instrument for priorityRace;
		* p-values use the normal distribution
		foreach iv in human predPtileCites3 {

			ivreg2 `Q' $time_controls $complexity_controls 					///
			(priorityRace = `iv'), robust first
				local z = _b[priorityRace] / _se[priorityRace]
				matrix comp_results[`i',`j'] = _b[priorityRace]
				local ++i
				matrix comp_results[`i',`j'] = _se[priorityRace]
				local ++i
				matrix comp_results[`i',`j'] = 2*normal(-abs(`z'))
				local ++i

				weakivtest // get the robust F-stat (Pflueger & Wang)
				matrix comp_results[`i',`j'] = r(F_eff)
				local ++i
		}

		* Outcome mean and observation count
		* NOTE(review): summarize is not restricted to e(sample), so these
		* cover every non-missing observation of the outcome - confirm intended
		summarize `Q'
		matrix comp_results[`i',`j'] = r(mean)
		local ++i
		matrix comp_results[`i',`j'] = r(N)
}

* Write the matrix into the table5 sheet starting at cell D60, leaving the
* data in memory untouched
preserve
	clear
	svmat comp_results
	export excel "${tables}Tables.xlsx", sheet(table5) cell(D60) sheetmodify
restore	

/*------------------------------------------------------------------------------

	Table E8: First Stage Results for Most Common Species

------------------------------------------------------------------------------*/

* Most common taxonomies include homo sapiens (4,614); e. coli (1,143); 
* mus musculus (873); saccharomyces cerevisiae (643); bacillus subtilis (328)
* Attach the number of structures per taxonomy and list the largest first
bysort taxonomy: generate taxonomyCount = _N
gsort -taxonomyCount
	
* First-stage matrix: one column per species indicator, rows are
*   1-3  coefficient, SE, p-value of the indicator in the first stage
*   4    effective F from weakivtest
*   5    number of structures with the indicator equal to one
*   6    number of non-missing observations of the indicator
matrix FS_regs = J(6,5,.)
local j = 0
	
* Run first stages
foreach tax in human ecoli mouse yeast haybacillus {

	* Move to the column of this species and restart at the top row
	local ++j
	local i = 1
		
	* First stage by OLS with robust SEs: coefficient, SE, p-value
	quietly regress priorityRace `tax' $time_controls $complexity_controls, robust
		local t = _b[`tax'] / _se[`tax']
		matrix FS_regs[`i',`j'] = _b[`tax']
		local ++i
		matrix FS_regs[`i',`j'] = _se[`tax']
		local ++i
		matrix FS_regs[`i',`j'] = 2*ttail(e(df_r), abs(`t'))
		local ++i
		
	* 2SLS with the time controls partialled out, run only so that
	* weakivtest can report the robust F-stat of the instrument
	quietly ivreg2 indexStd $time_controls $complexity_controls 			///
	(priorityRace = `tax'), robust first partial($time_controls)
		
		weakivtest // get the robust F-stat (Pflueger & Wang)
		matrix FS_regs[`i',`j'] = r(F_eff)
		local ++i
		
	* Counts: mean times N gives the number of ones of the indicator
	summarize `tax'
	matrix FS_regs[`i',`j'] = r(mean)*r(N)
	local ++i
	matrix FS_regs[`i',`j'] = r(N)
		
}
	
* Write the matrix into the tableE8 sheet starting at cell D60, leaving the
* data in memory untouched
preserve
	clear
	svmat FS_regs
	export excel "${tables}Tables.xlsx", sheet(tableE8) cell(D60) sheetmodify
restore	
	
/*------------------------------------------------------------------------------

	Table E9: Assessing Balance Between Non-Human and Human Structures

------------------------------------------------------------------------------*/

* Balance matrix: rows 1-3 are the complexity variables, row 4 the counts
*   column 1  value for non-human structures (regression constant)
*   column 2  value for human structures (constant plus human coefficient)
*   column 3  p-value of the human coefficient (rows 1-3 only)
matrix balance = J(4,3,.)
local i = 1
	
* Regress each complexity variable on the human indicator with robust SEs
foreach v in lnStructureMolecularWt lnResidueCount lnAtomSiteCount {
	quietly regress `v' human, robust
		local x = _b[_cons] + _b[human]
		local t = _b[human] / _se[human]
		matrix balance[`i', 1] = _b[_cons]
		matrix balance[`i', 2] = `x'
		matrix balance[`i', 3] = 2*ttail(e(df_r), abs(`t'))
		local ++i
		
}
	
* Last row: number of non-human and human structures
summarize human
matrix balance[`i', 1] = (1-r(mean))*r(N)
matrix balance[`i', 2] = r(mean)*r(N)
	
* Write the matrix into the tableE9 sheet starting at cell D60, leaving the
* data in memory untouched
preserve
	clear
	svmat balance
	export excel "${tables}Tables.xlsx", sheet(tableE9) cell(D60) sheetmodify
restore	
	

	



